Diamonds EDA

Overview:

The aim of this work is to study the affect of other variables on diamond`s price.

Load data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(patchwork)
library(skimr)

diamonds <- read_csv("https://raw.githubusercontent.com/tidyverse/ggplot2/master/data-raw/diamonds.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   carat = col_double(),
##   cut = col_character(),
##   color = col_character(),
##   clarity = col_character(),
##   depth = col_double(),
##   table = col_double(),
##   price = col_double(),
##   x = col_double(),
##   y = col_double(),
##   z = col_double()
## )
cut_levels = c("Fair", "Good", "Very Good", "Premium", "Ideal")
color_levels = c("D", "E", "F", "G", "H", "I", "J")
clarity_levels = c("I1", "SI2", "SI1",  "VS2",  "VS1",  "VVS2", "VVS1", "IF")


diamonds <- diamonds %>% 
  mutate(
    cut = factor(cut, levels = cut_levels, ordered = TRUE),
    color = factor(color, levels  = color_levels, ordered = TRUE),
    clarity = factor(clarity, levels  = clarity_levels, ordered = TRUE))

Quality report (10)

library(skimr)

diamonds %>% 
  skim()
Data summary
Name Piped data
Number of rows 53940
Number of columns 10
_______________________
Column type frequency:
factor 3
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
cut 0 1 TRUE 5 Ide: 21551, Pre: 13791, Ver: 12082, Goo: 4906
color 0 1 TRUE 7 G: 11292, E: 9797, F: 9542, H: 8304
clarity 0 1 TRUE 8 SI1: 13065, VS2: 12258, SI2: 9194, VS1: 8171

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
carat 0 1 0.80 0.47 0.2 0.40 0.70 1.04 5.01 ▇▂▁▁▁
depth 0 1 61.75 1.43 43.0 61.00 61.80 62.50 79.00 ▁▁▇▁▁
table 0 1 57.46 2.23 43.0 56.00 57.00 59.00 95.00 ▁▇▁▁▁
price 0 1 3932.80 3989.44 326.0 950.00 2401.00 5324.25 18823.00 ▇▂▁▁▁
x 0 1 5.73 1.12 0.0 4.71 5.70 6.54 10.74 ▁▁▇▃▁
y 0 1 5.73 1.14 0.0 4.72 5.71 6.54 58.90 ▇▁▁▁▁
z 0 1 3.54 0.71 0.0 2.91 3.53 4.04 31.80 ▇▁▁▁▁

Numeric

  • price
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


  • other
diamonds %>%
  select(where(is.numeric)) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer( cols =  -id) %>% 
  ggplot(aes(x = value)) + 
  geom_histogram() + 
  facet_wrap(~name) + 
  facet_wrap(~name, scales = "free") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

diamonds %>%
  select(where(is.numeric)) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer( cols =  -id)%>% 
  ggplot(aes(x = value)) + 
  geom_boxplot() + 
  facet_wrap(~name) + 
  facet_wrap(~name,
             scales = "free") 


Categoric

diamonds %>%
  select(where(is.factor))  %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols = -id, values_ptypes = list(value = 'character')) %>% 
  mutate(value = factor(value, c(cut_levels, color_levels, clarity_levels), ordered = TRUE)) %>% 
  ggplot(aes(y = value)) + 
  geom_bar() + 
  facet_wrap(~name) + 
  facet_wrap(~name, scales = "free") 

Data qualiry plan

feature issue potential soluation comment
price Right skewed take log result is bimodal
carat Right skewed take log
depth Outlier (right-left) is it valid?
table Outlier (right-left) is it valid?
x Outlier (right-left) is it valid?
y Outlier (right-left) is it valid?
z Outlier (right-left) is it valid?
clarity very view of I1,IF
color
cut very view of Fair

Relationship

  • Price vs Carat scatter-plot
price_carat<-diamonds %>% 
  ggplot(aes(x =  carat,
             y = price )) +
  geom_point() +
  ggtitle("before log10")


price_carat_log<-diamonds %>% 
  ggplot(aes(x =  carat,
             y = price )) +
  geom_point() + 
  scale_y_log10()  + 
  scale_x_log10()  +
  ggtitle("after log10")


price_carat + price_carat_log

there is a strong relationship between Carat and price correlation is0.9215913 and after we take the log of both correlation is 0.9659137.

  • Price vs cut
# install.packages("patchwork")
# install.packages("ggridges")
library(ggridges)
library(patchwork)


price_cut_box<-diamonds %>% 
  ggplot(aes(x =  cut,
             y = price)) +
  geom_boxplot() +
  scale_y_log10()


price_cut_dens<- diamonds %>% 
  ggplot(aes(y =  cut,
             x = price)) + 
  geom_density_ridges() +
  scale_x_log10() 

price_cut_box + coord_flip() + price_cut_dens
## Picking joint bandwidth of 0.0613

why as the cut quality increases median price decreases?

ask google

  • price vs cut vs carat.
p1<-diamonds %>% 
  ggplot(aes(y = price, x = carat)) +
  geom_jitter() +
  scale_x_log10() +
  scale_y_log10() 

p2<-diamonds %>% 
  ggplot(aes(y = price, x = cut)) +
  geom_boxplot() +
  scale_y_log10() 
 
p3<-diamonds %>% 
  ggplot(aes(y = carat, x = cut)) +
  geom_boxplot() +
  scale_y_log10() 



p4<- diamonds %>% 
  ggplot(aes(x = log10(carat), y = log10(price), color = cut)) +
  geom_jitter(alpha = .2) +
  geom_smooth() 


(p1 + p2 + p3)/p4
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

diamonds %>% #BREAK
  mutate(carat = cut_width(carat, 1)) %>% #BREAK
  ggplot(aes(x = price,
             fill = carat)) + #BREAK
  geom_histogram() +#BREAK
  scale_x_log10() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

مصفوفة الارتباط


المراجع